{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "be1c4fe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d6d8e465",
   "metadata": {},
   "outputs": [],
   "source": [
    "#准备数据保存\n",
    "ids=[]\n",
    "names=[]\n",
    "codes=[]\n",
    "areas=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "eb8a23c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\anaconda\\envs\\laptop\\lib\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www.crcrfsp.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'<!doctype html>\\n<html>\\n<head>\\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\\n<meta http-equiv=\"pragma\" content=\"no-cache\" />\\n<meta http-equiv=\"cache-control\" content=\"no-cache\" />\\n<meta http-equiv=\"expires\" content=\"0\" />\\n\\n<!--style start-->\\n<link href=\"/common/HTMLTurbo/0.1.0/HTMLTurbo.css?versionDate=1703848745542\" rel=\"stylesheet\" type=\"text/css\" />\\n<link href=\"/portal/style/common.css?versionDate=1703848745542\" rel=\"stylesheet\" type=\"text/css\" />\\n<link href=\"/portal/style/layout.css?versionDate=1703848745542\" rel=\"stylesheet\" type=\"text/css\" />\\n<link href=\"/portal/base/userOrg/style/columnContent.css?versionDate=1703848745542\" rel=\"stylesheet\" type=\"text/css\" />\\n<!--style end-->\\n\\n<!--script start-->\\n<script src=\"/common/jquery/1.12.4/jquery.min.js?versionDate=1703848745542\"></script>\\n<!--[if lt IE 9]><script src=\"/common/html5/3.7.3/html5.js?versionDate=1703848745542\"></script><![endif]-->\\n<script src=\"/common/seajs/2.3.0/sea.js?versionDate=1703848745542\"></script>\\n<script src=\"/common/HTMLTurbo/edge/seaConfig.js?ts=1704784370044\"></script>\\n<!--script end-->\\n\\n</head>\\n<body data-source=\"formObject\" class=\"listPage\" >\\n<form name=\"searchForm\" id=\"searchForm\" method=\"post\" action=\"/org.do?userType=1&orgCatalog=20\">\\n  <div class=\"module searchModule\" id=\"searchModule\">\\n    <h4 class=\"titleBar\">按条件查询企业信息</h4>\\n    <table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" class=\"vTable\">\\n      <tr>\\n        <th>企业名称</th>\\n        <td><input type=\"text\" id=\"orgName\" name=\"orgName\" value=\"\"/></td>\\n      </tr>\\n      <tr>\\n        <th>组织机构代码</th>\\n        <td><input type=\"text\" id=\"orgCode\" name=\"orgCode\" value=\"\"/></td>\\n      </tr>\\n      <tr style=\"width:100%;\">\\n        <th>所属区域</th>\\n        <td style=\"width:70%;\"><select id=\"province\" name=\"province\">\\n\\t\\t\\t<option value=\"\"></option>\\n          </select>\\n          <select style=\"width:auto;\" id=\"city\" name=\"city\">\\n\\t\\t  \\t<option value=\"\"></option>\\n          </select>\\n          <select style=\"width:auto;\" id=\"county\" name=\"county\">\\n\\t\\t  \\t<option value=\"\"></option>\\n          </select></td>\\n      </tr>\\n    </table>\\n    <div class=\"toolbar\">\\n      <input type=\"submit\" value=\"查询\" />\\n      <input type=\"reset\" id=\"clearButton\" value=\"清空\" /><!-- bug#34010 要求清空后不查询-->\\n      <!--<a class=\"button\" id=\"resetButton\" href=\"/org.do?userType=1&orgCatalog=20\">清空</a>-->\\n       <!--<input type=\"reset\" value=\"清空\" /> -->\\n    </div>\\n  </div>\\n  </form>\\n  <form name=\"listForm\" id=\"listForm\" method=\"post\" action=\"/org.do?userType=1&orgCatalog=20\">\\n  <div class=\"module listModule\" id=\"listModule\">\\n    <h4 class=\"titleBar\">注册企业信息列表</h4>\\n    <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" class=\"hTable\" data-check=\"true\">\\n      <thead>\\n        <tr>\\n          <th>序号</th>\\n          <th>注册企业</th>\\n          <th>组织机构代码</th>\\n          <th>所属区域</th>\\n        </tr>\\n      </thead>\\n      <tbody>\\n  <tr>\\n    <td>1</td>\\n    <td>西北永新涂料有限公司</td>\\n    <td>05311823-X</td>\\n    <td>甘肃省-兰州市-城关区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>2</td>\\n    <td>汕头市正天药业有限公司</td>\\n    <td>19276891-0</td>\\n    <td>广东省-汕头市-龙湖区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>3</td>\\n    <td>杭州银行股份有限公司温州分行</td>\\n    <td>57930491-8</td>\\n    <td>浙江省-温州市-鹿城区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>4</td>\\n    <td>四川明宏光学科技有限公司</td>\\n    <td>91511424MA62J1LT75</td>\\n    <td>四川省-眉山市-丹棱县</td>\\n  </tr>\\n\\n  <tr>\\n    <td>5</td>\\n    <td>泸州鑫天忠泰商贸有限公司</td>\\n    <td>06676683-4</td>\\n    <td>四川省-泸州市-江阳区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>6</td>\\n    <td>十四冶建设云南勘察设计有限公司</td>\\n    <td>78169986-1</td>\\n    <td>云南省-昆明市-五华区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>7</td>\\n    <td>十四冶建设集团有限公司</td>\\n    <td>75715587-2</td>\\n    <td>云南省-昆明市-五华区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>8</td>\\n    <td>天津空畅汽车贸易有限公司</td>\\n    <td>69066253-7</td>\\n    <td>天津市-市辖区-滨海新区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>9</td>\\n    <td>青岛世骏信进汽车销售服务有限公司</td>\\n    <td>58781968-X</td>\\n    <td>山东省-青岛市-城阳区</td>\\n  </tr>\\n\\n  <tr>\\n    <td>10</td>\\n    <td>天津市天物中辰汽车发展有限公司</td>\\n    <td>68474484-3</td>\\n    <td>天津市-市辖区-滨海新区</td>\\n  </tr>\\n</tbody>\\n    </table>\\n    <div id=\"pagination\" class=\"ht_pagination\"><ul class=\"pageList\"><li class=\"first {current}\" title=\"跳转到第：1页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"1\" /></li><li class=\"prev {current}\" title=\"跳转到第：1页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"1\" /></li><li class=\"current {current}\" title=\"跳转到第：1页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"1\" /></li><li class=\" {current}\" title=\"跳转到第：2页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"2\" /></li><li class=\" {current}\" title=\"跳转到第：3页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"3\" /></li><li class=\" {current}\" title=\"跳转到第：4页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"4\" /></li><li class=\" {current}\" title=\"跳转到第：5页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"5\" /></li><li class=\" {current}\" title=\"跳转到第：6页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"6\" /></li><li class=\" {current}\" title=\"跳转到第：7页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"7\" /></li><li class=\" {current}\" title=\"跳转到第：8页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"8\" /></li><li class=\" {current}\" title=\"跳转到第：9页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"9\" /></li><li class=\" {current}\" title=\"跳转到第：10页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"10\" /></li><li class=\"next {current}\" title=\"跳转到第：2页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"2\" /></li><li class=\"last {current}\" title=\"跳转到第：23433页\"><input type=\"submit\" name=\"submit_pageNum\" value=\"23433\" /></li></ul>\\r\\n<p>跳转到：<input type=\"text\" name=\"pageNum\" id=\"pageNum\" size=\"3\" maxlength=\"9\" class=\"pageNum\" value=\"1\"/><input type=\"submit\" class=\"submitButton\" value=\"GO\"/><span class=\"tips\">当前第 1 页/总 23433 页，共有 <span id=\"infoTotal\">234321</span> 条信息 <input type=\"hidden\" name=\"pageTotal\" id=\"pageTotal\" value=\"23433\"/></span></p></div>\\n  </div>\\n</form>\\n</body >\\n</html>\\n\\n\\n<script type=\"text/javascript\">\\nseajs.use(\"portal/base/userOrg/script/ListPage\");\\n</script>\\n'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url=r\"https://www.crcrfsp.com/org.do?userType=1&orgCatalog=20&pageNum=1&pageTotal=23280\"\n",
    "r=requests.get(url=url,verify=False)\n",
    "print(r.status_code)\n",
    "#爬取到的网页内容\n",
    "r.text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "402abaad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],\n",
       " ['西北永新涂料有限公司',\n",
       "  '汕头市正天药业有限公司',\n",
       "  '杭州银行股份有限公司温州分行',\n",
       "  '四川明宏光学科技有限公司',\n",
       "  '泸州鑫天忠泰商贸有限公司',\n",
       "  '十四冶建设云南勘察设计有限公司',\n",
       "  '十四冶建设集团有限公司',\n",
       "  '天津空畅汽车贸易有限公司',\n",
       "  '青岛世骏信进汽车销售服务有限公司',\n",
       "  '天津市天物中辰汽车发展有限公司'],\n",
       " ['05311823-X',\n",
       "  '19276891-0',\n",
       "  '57930491-8',\n",
       "  '91511424MA62J1LT75',\n",
       "  '06676683-4',\n",
       "  '78169986-1',\n",
       "  '75715587-2',\n",
       "  '69066253-7',\n",
       "  '58781968-X',\n",
       "  '68474484-3'],\n",
       " ['甘肃省-兰州市-城关区',\n",
       "  '广东省-汕头市-龙湖区',\n",
       "  '浙江省-温州市-鹿城区',\n",
       "  '四川省-眉山市-丹棱县',\n",
       "  '四川省-泸州市-江阳区',\n",
       "  '云南省-昆明市-五华区',\n",
       "  '云南省-昆明市-五华区',\n",
       "  '天津市-市辖区-滨海新区',\n",
       "  '山东省-青岛市-城阳区',\n",
       "  '天津市-市辖区-滨海新区']]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#解析网页\n",
    "html=r.text\n",
    "soup=BeautifulSoup(html,'html.parser')\n",
    "#找到全部企业信息\n",
    "thread=soup.find(name='form',attrs={\"name\":\"listForm\",\"id\":\"listForm\" ,\n",
    "                                    \"method\":\"post\",\n",
    "                                    \"action\":\"/org.do?userType=1&orgCatalog=20\"})\n",
    "items=thread.find_all(name=\"tr\")\n",
    "\n",
    "for item in items:\n",
    "    elements=item.find_all(name=\"td\")\n",
    "    if elements!=[]:\n",
    "        ids.append(elements[0].string)\n",
    "        names.append(elements[1].string)\n",
    "        codes.append(elements[2].string)\n",
    "        areas.append(elements[3].string)\n",
    "all_imformation=[ids,names,codes,areas]\n",
    "all_imformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "631ec8bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "#输出到excel\n",
    "data=pd.DataFrame([ids,names,codes,areas])\n",
    "data=data.T\n",
    "data.columns=['ids','names','codes','areas']\n",
    "data.to_excel('E:\\微信公众号\\程序\\爬虫\\上传版本\\jupyter\\sample.xlsx')\n",
    "data2=pd.read_excel('E:\\微信公众号\\程序\\爬虫\\上传版本\\jupyter\\sample.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2cb6aad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
